J
(
θ
)
θ
$$J(\theta) = \mathbb{E}_{(x,y)\sim\hat{p}}\; L(f(x;\theta),\, y),$$
L
f
(
x
;
θ
)
x
y
ˆ
p
p
$$J^{*}(\theta) = \mathbb{E}_{(x,y)\sim p}\; L(f(x;\theta),\, y).$$
P
J
(
θ
)
P
J
x
y
p
(
x
,
y
)
L
(
x
,
y
)
$\mathbb{E}_{x,y\sim p(x,y)}[L(x,y)]$
p
p
(
x
,
y
)
p
(
x
,
y
)
p
(
x
,
y
)
ˆ
p
(
x
,
y
)
$$\mathbb{E}_{x,y\sim\hat{p}(x,y)}\big[L(f(x;\theta),\, y)\big] = \frac{1}{m}\sum_{i=1}^{m} L\big(f(x^{(i)};\theta),\, y^{(i)}\big)$$
m
$$\theta_{\mathrm{ML}} = \arg\max_{\theta} \sum_{i=1}^{m} \log p(x^{(i)};\theta).$$
$$J(\theta) = \mathbb{E}_{x\sim\hat{p}}\, \log p(x;\theta).$$
J
$$\nabla_{\theta} J(\theta) = \mathbb{E}_{x\sim\hat{p}}\, \nabla_{\theta} \log p(x;\theta).$$
n
$\hat{\sigma}/\sqrt{n},$
ˆ
σ
m
1
m
•
•
•
•
•
g
H
H
−
1
g
H
g
H
−
1
g
H
J
(
x
)
x
(
x
,
y
)
p
(
x
,
y
)
y
L
(
f
(
x
;
θ
)
,
y
)
p
(
x
,
y
)
θ
f
(
·
;
θ
)
θ
$$J^{*}(\theta) = \int L(f(x;\theta),\, y)\; dp(x,y)$$
p
$$g = \frac{\partial J^{*}(\theta)}{\partial \theta} = \int \frac{\partial L(f(x;\theta),\, y)}{\partial \theta}\; dp(x,y),$$
(
x
,
y
)
p
$$\hat{g} = \frac{\partial L(f(x;\theta),\, y)}{\partial \theta}.$$
ˆ
g
g
θ
ˆ
g
θ
=
θ
−
α
g
α
$$g = \nabla_{\theta} J(\theta).$$
$$J(\theta - \alpha g) \approx J(\theta) - \alpha\, g^{\top} g + \frac{1}{2}\alpha^{2}\, g^{\top} H g$$
H
J
θ
−
α
g
g
1
2
α
2
g
H
g
H
g
g
H
g
g
g
H
g
g
H
g
i
j
m
n
n
!
m
α
1
/α
(
m
×
n
)
n
n
n
θ
θ
θ
$$f = f_{T} \circ f_{T-1} \circ \cdots \circ f_{2} \circ f_{1}$$
f
(
x
)
x
$$f' = f'_{T}\, f'_{T-1} \cdots f'_{2}\, f'_{1}$$
$$f' = \frac{\partial f(x)}{\partial x}$$
$$f'_{t} = \frac{\partial f_{t}(a_{t})}{\partial a_{t}},$$
$$a_{t} = f_{t-1}\big(f_{t-2}(\ldots f_{2}(f_{1}(x)))\big)$$
y
x
α
α
T
α
<
1
∞
α
>
1
T
T
T
√
T
e
T
x
log
x
T
W
x
1
,
.
.
.
,
x
t
,
.
.
.
$$s_{t} = F_{\theta}(s_{t-1},\, x_{t})$$
s
t
F
θ
$$o_{t} = g_{\omega}(s_{t}),$$
L
t
t
o
t
y
t
L
T
T
θ
F
θ
∂
L
T
∂
θ
$$\frac{\partial L_{T}}{\partial \theta} = \sum_{t \le T} \frac{\partial L_{T}}{\partial s_{t}}\, \frac{\partial s_{t}}{\partial \theta}$$
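$$\frac{\partial L_{T}}{\partial \theta} = \sum_{t \le T} \frac{\partial L_{T}}{\partial s_{T}}\, \frac{\partial s_{T}}{\partial s_{t}}\, \frac{\partial F_{\theta}(s_{t-1},\, x_{t})}{\partial \theta}$$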
θ
F
θ
s
t
=
F
θ
(
s
t
−
1
,
x
t
)
θ
s
t
−
1
θ
s
t
L
T
s
T
∂
s
T
∂
s
t
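$$\frac{\partial s_{T}}{\partial s_{t}} = \frac{\partial s_{T}}{\partial s_{T-1}}\, \frac{\partial s_{T-1}}{\partial s_{T-2}} \cdots \frac{\partial s_{t+1}}{\partial s_{t}}$$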
∂
L
T
∂
θ
T
−
t
t
T
∂
s
t
∂
s
t
−
1
θ
[
x
(
t
)
,
y
(
t
)
]
θ
←
θ
+
∇
θ
t
L
(
f
(
x
(
t
)
;
θ
)
,
y
(
t
)
;
θ
)
,
µ
L
1
/
L
L
(1
−
µ
L
)
µ
O
(1
/k
)
k
L
µ
µ
m
E
[
ˆ
g
]
E
[
ˆ
g
] =
g
,
g
m
= 1
m
>
1
m
η
k
k
η
θ
m
{
x
(1)
,
.
.
.
,
x
(
m
)
}
ˆ
g
=
0
i
= 1
m
ˆ
g
←
ˆ
g
+
∇
θ
L
(
f
(
x
(
i
)
;
θ
)
,
y
(
i
)
)
/m
θ
←
θ
k
−
η
ˆ
g
m
$$\sum_{k=1}^{\infty} \eta_{k} = \infty, \quad \text{and} \quad \sum_{k=1}^{\infty} \eta_{k}^{2} < \infty.$$
k
µ
L
O
((1
−
µ
L
)
k
)
µ
O
(1
/k
)
O
(1
/k
2
)
O
(1
/
√
k
)
O
(1
/k
)
O
(1
/k
)
O
()
Gradients
Velocity
v
$$v \leftarrow \alpha v - \eta\, \nabla_{\theta}\left(\frac{1}{m}\sum_{t=1}^{m} L\big(f(x^{(t)};\theta),\, y^{(t)}\big)\right)$$
$$\theta \leftarrow \theta + v$$
v
∇
θ
1
n
n
t
=1
L
(
f
(
x
(
t
)
;
θ
)
,
y
(
t
)
)
α
η
α
η
η
α
θ
v
m
{
x
(1)
,
.
.
.
,
x
(
m
)
}
g
=
0
i
= 1
m
g
←
g
+
∇
θ
L
(
f
(
x
(
i
)
;
θ
)
,
y
(
i
)
)
v
←
α
v
−
η
g
θ
←
θ
+
v
$$v \leftarrow \alpha v - \eta\, \nabla_{\theta}\left[\frac{1}{m}\sum_{t=1}^{m} L\big(f(x^{(t)};\theta + \alpha v),\, y^{(t)}\big)\right],$$
$$\theta \leftarrow \theta + v,$$
α
η
O
(1
/k
)
k
O
(1
/k
2
)
O
(1
−
µ
L
)
O
(1
−
µ
L
)
η
∇
θ
1
m
m
i
=1
L
f
(
x
(
i
)
;
θ
)
,
y
(
i
)
η
∇
θ
1
m
m
i
=1
L
f
(
x
(
i
)
;
θ
+
α
v
)
,
y
(
i
)
α
v
+
η
∇
θ
1
m
m
i
=1
L
f
(
x
(
i
)
;
θ
+
α
v
)
,
y
(
i
)
α
v
Standard momentum
Previous velocity
Nesterov correction term
Nesterov accumulated gradient
η
α
θ
v
m
{
x
(1)
,
.
.
.
,
x
(
m
)
}
θ
←
θ
+
α
v
g
=
0
i
= 1
m
g
←
g
+
∇
θ
L
(
f
(
x
(
i
)
;
θ
)
,
y
(
i
)
)
v
←
α
v
−
η
g
θ
←
θ
+
v
η
θ
r
=
0
m
{
x
(1)
,
.
.
.
,
x
(
m
)
}
g
=
0
i
= 1
m
g
←
g
+
∇
θ
L
(
f
(
x
(
i
)
;
θ
)
,
y
(
i
)
)
$$r \leftarrow r + g^{2}$$
$$\Delta\theta \leftarrow -\frac{\eta}{\sqrt{r}}\, g$$
1
√
r
θ
←
θ
+
∆
θ
t
ρ
η
ρ
θ
r
= 0
m
{
x
(1)
,
.
.
.
,
x
(
m
)
}
g
=
0
i
= 1
m
g
←
g
+
∇
θ
L
(
f
(
x
(
i
)
;
θ
)
,
y
(
i
)
)
$$r \leftarrow \rho\, r + (1-\rho)\, g^{2}$$
$$\Delta\theta = -\frac{\eta}{\sqrt{r}}\, g$$
1
√
r
θ
←
θ
+
∆
θ
η
ρ
α
θ
v
r
=
0
m
{
x
(1)
,
.
.
.
,
x
(
m
)
}
θ
←
θ
+
α
v
g
=
0
i
= 1
m
g
←
g
+
∇
θ
L
(
f
(
x
(
i
)
;
θ
)
,
y
(
i
)
)
r
←
ρ
r
+
(1
−
ρ
)
g
2
v
←
α
v
−
η
√
r
g
1
√
r
θ
←
θ
+
v
α
ρ
1
ρ
2
θ
s
=
0
r
=
0
t
= 0
m
{
x
(1)
,
.
.
.
,
x
(
m
)
}
g
=
0
i
= 1
m
g
←
g
+
∇
θ
L
(
f
(
x
(
i
)
;
θ
)
,
y
(
i
)
)
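$$t \leftarrow t + 1$$
$$s \leftarrow \rho_{1}\, s + (1-\rho_{1})\, g$$
$$r \leftarrow \rho_{2}\, r + (1-\rho_{2})\, g^{2}$$
$$\hat{s} \leftarrow \frac{s}{1-\rho_{1}^{t}}$$
$$\hat{r} \leftarrow \frac{r}{1-\rho_{2}^{t}}$$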
∆
θ
=
−
α
s
√
r
+
g
θ
←
θ
+
∆
θ
θ
j
{
x
(
i
)
,
y
(
i
)
}
∆
θ
j
=
−
1
∂
2
∂
θ
2
j
L
(
f
(
x
(
i
)
;
θ
0
)
,
y
(
i
)
)
∂
∂
θ
j
L
(
f
(
x
(
i
)
;
θ
0
)
,
y
(
i
)
)
1
∂
2
∂
θ
2
j
L
(
f
(
x
(
i
)
;
θ
0
)
,
y
(
i
)
)
=
∆
θ
j
∂
∂
θ
j
L
(
f
(
x
(
i
)
;
θ
0
)
,
y
(
i
)
)
∆
θ
j
θ
j
θ
0
L
(
f
(
x
(
i
)
;
θ
0
+
e
j
∆
θ
j
)
,
y
(
i
)
)
≈
L
(
f
(
x
(
i
)
;
θ
0
)
,
y
(
i
)
) +
e
j
∂
∂
θ
j
L
(
f
(
x
(
i
)
;
θ
0
)
,
y
(
i
)
)
∆
θ
j
+
e
j
1
2
∂
2
∂
θ
2
j
L
(
f
(
x
(
i
)
;
θ
0
)
,
y
(
i
)
)
∆
θ
2
j
∆
θ
j
∆
θ
j
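$$\Delta\theta_{j} = -\,\frac{\dfrac{\partial}{\partial\theta_{j}} L\big(f(x^{(i)};\theta_{0}),\, y^{(i)}\big)}{\dfrac{\partial^{2}}{\partial\theta_{j}^{2}} L\big(f(x^{(i)};\theta_{0}),\, y^{(i)}\big)}$$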
ρ
θ
r
=
0
s
=
0
m
{
x
(1)
,
.
.
.
,
x
(
m
)
}
g
=
0
i
= 1
m
g
←
g
+
∇
θ
L
(
f
(
x
(
i
)
;
θ
)
,
y
(
i
)
)
r
←
ρ
r
+
(1
−
ρ
)
g
2
∆
θ
=
−
√
s
+
√
r
+
g
s
←
ρ
s
+
(1
−
ρ
)
[∆
θ
]
2
θ
←
θ
+
∆
θ
$$J(\theta) = \mathbb{E}_{x,y\sim\hat{p}(x,y)}\big[L(f(x;\theta),\, y)\big] = \frac{1}{m}\sum_{i=1}^{m} L\big(f(x^{(i)};\theta),\, y^{(i)}\big).$$
J
(
θ
)
H
(
J
)(
θ
)
H
H
(
J
)(
θ
)
i,j
=
∂
2
∂
θ
i
∂
θ
j
J
(
x
;
θ
)
.
θ
J
J
(
θ
)
θ
0
$$J(\theta) \approx J(\theta_{0}) + (\theta-\theta_{0})^{\top}\nabla_{\theta} J(\theta_{0}) + \frac{1}{2}(\theta-\theta_{0})^{\top} H(J)(\theta_{0})\,(\theta-\theta_{0}).$$
$$\theta^{*} = \theta_{0} - \big[H(J(\theta_{0}))\big]^{-1}\, \nabla_{\theta} J(\theta_{0})$$
H
H
−
1
$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m} L\big(f(x^{(i)};\theta),\, y^{(i)}\big)$$
θ
0
g
=
0
H
=
0
i
= 1
m
g
←
g
+
1
m
∇
θ
L
(
f
(
x
(
i
)
;
θ
)
,
y
(
i
)
)
H
←
H
+
1
m
∇
2
θ
L
(
f
(
x
(
i
)
;
θ
)
,
y
(
i
)
)
H
−
1
$$\Delta\theta_{t} = -H^{-1} g$$
$$\theta_{t+1} = \theta_{t} + \Delta\theta_{t}$$
H
H
$$H = Q \Lambda Q^{\top}$$
Q
θ
φ
φ
=
Λ
1
2
Q
θ
φ
∇
θ
f
(
θ
0
)
∇
φ
f
(
θ
0
)
∇
φ
f
(
θ
0
) =
∂
θ
∂
φ
∇
θ
f
(
θ
0
)
=
Q
Λ
1
2
∇
θ
f
(
θ
0
)
∇
θ
f
(
θ
0
) =
Λ
1
2
Q
∇
φ
f
(
θ
0
)
H
φ
Q
∀
i
Q
:
,i
Q
:
,i
=
1
Q
i,
:
Q
i,
:
= 1
i
=
j
Q
:
,i
Q
:
,j
= 0
Q
i,
:
Q
j,
:
= 0
0
0
0
0
Q
Λ
−
1
2
θ
2
θ
1
Λ
−
1
2
Q
⊤
θ
2
θ
1
φ
1
φ
2
φ
1
φ
2
θ
φ
f
(
θ
)
≈
f
(
θ
0
)
+
(
θ
−
θ
0
)
∇
θ
f
(
θ
0
)
+
1
2
(
θ
−
θ
0
)
H
(
θ
−
θ
0
)
=
f
(
θ
0
)
+
(
θ
−
θ
0
)
Q
Λ
1
2
∇
φ
f
(
θ
0
)
+
1
2
(
θ
−
θ
0
)
Q
Λ
Q
(
θ
−
θ
0
)
=
f
(
θ
0
)
+
Λ
1
2
Q
θ
−
Λ
1
2
Q
θ
0
∇
φ
f
(
θ
0
)
+
1
2
Λ
1
2
Q
θ
−
Λ
1
2
Q
θ
0
Λ
1
2
Q
θ
−
Λ
1
2
Q
θ
0
=
f
(
θ
0
)
+
(
φ
−
φ
0
)
∇
φ
f
(
θ
0
)
+
1
2
(
φ
−
φ
0
)
(
φ
−
φ
0
)
.
φ
φ
φ
∗
=
φ
0
−
∇
φ
f
(
θ
0
)
,
φ
θ
φ
α
$$\theta^{*} = \theta_{0} - \big[H(f(\theta_{0})) + \alpha I\big]^{-1}\, \nabla_{\theta} f(\theta_{0}).$$
α
α
α
I
K
K
K
×
K
O
(
K
3
)
0
0
θ
1
θ
2
d
t
−
1
d
t
−
1
$$\nabla_{\theta} J(\theta) \cdot d_{t-1} = 0$$
d
t
=
∇
θ
J
(
θ
)
d
t
−
1
d
t
d
t
−
1
d
t
−
1
d
t
t
d
t
$$d_{t} = \nabla_{\theta} J(\theta) + \beta_{t}\, d_{t-1}$$
β
t
d
t
−
1
d
t
d
t
−
1
$$d_{t}^{\top}\, H(J)\, d_{t-1} = 0$$
φ
d
t
H
d
t
−
1
= 0
d
t
Q
Λ
Q
d
t
−
1
= 0
Λ
1
2
Q
d
t
−
1
Λ
1
2
Q
d
t
−
1
= 0
d
(
φ
)
t
d
(
φ
)
t
−
1
= 0
,
d
(
φ
)
t
−
1
=
Λ
1
2
Q
d
t
d
t
−
1
φ
φ
H
β
t
$$\beta_{t} = \frac{\nabla_{\theta} J(\theta_{t})^{\top}\, \nabla_{\theta} J(\theta_{t})}{\nabla_{\theta} J(\theta_{t-1})^{\top}\, \nabla_{\theta} J(\theta_{t-1})}$$
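$$\beta_{t} = \frac{\big(\nabla_{\theta} J(\theta_{t}) - \nabla_{\theta} J(\theta_{t-1})\big)^{\top}\, \nabla_{\theta} J(\theta_{t})}{\nabla_{\theta} J(\theta_{t-1})^{\top}\, \nabla_{\theta} J(\theta_{t-1})}$$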
k
k
θ
0
ρ
0
=
0
g
t
=
0
i
= 1
m
g
t
←
g
t
+
1
m
∇
θ
L
(
f
(
x
(
i
)
;
θ
)
,
y
(
i
)
)
$$\beta_{t} = \frac{(g_{t} - g_{t-1})^{\top} g_{t}}{g_{t-1}^{\top}\, g_{t-1}}$$
$$\rho_{t} = -g_{t} + \beta_{t}\, \rho_{t-1}$$
η
∗
= argmin
η
1
m
m
i
=1
L
(
f
(
x
(
i
)
;
θ
)
,
y
(
i
)
)
θ
t
+1
=
θ
t
+
η
∗
ρ
t
θ
∗
=
θ
0
−
[
H
(
J
(
θ
0
))]
−
1
∇
θ
J
(
θ
0
)
.
H
(
J
)(
θ
0
)
M
t
H
(
J
)
t
θ
t
+1
−
θ
t
=
−
H
−
1
(
∇
θ
J
(
θ
t
+1
)
−
∇
θ
J
(
θ
t
))
M
H
−
1
M
M
t
=
M
t
−
1
+
1
+
φ
M
t
−
1
φ
∆
φ
φ
φ
∆
φ
−
∆
φ
M
t
−
1
+
M
t
−
1
φ
∆
∆
φ
,
$$g_{t} = \nabla_{\theta} J(\theta_{t})$$
$$\phi = g_{t} - g_{t-1}$$
$$\Delta = \theta_{t} - \theta_{t-1}$$
θ
∈
R
n
O
(
n
2
)
M
t
ρ
t
ρ
t
=
M
t
g
t
η
∗
θ
t
+1
=
θ
t
+
η
∗
ρ
t
.
θ
0
M
0
=
I
g
t
=
∇
θ
J
(
θ
t
)
φ
=
g
t
−
g
t
−
1
∆
=
θ
t
−
θ
t
−
1
H
−
1
M
t
=
M
t
−
1
+
1
+
φ
M
t
−
1
φ
∆
φ
φ
φ
∆
φ
−
∆
φ
M
t
−
1
+
M
t
−
1
φ
∆
∆
φ
$$\rho_{t} = M_{t}\, g_{t}$$
$$\eta^{*} = \arg\min_{\eta} J(\theta_{t} + \eta\, \rho_{t})$$
$$\theta_{t+1} = \theta_{t} + \eta^{*} \rho_{t}$$
M
O
(
n
2
)
M
M
t
−
1
$$\rho_{t} = -g_{t} + b\,\Delta + a\,\phi,$$
a
b
a
=
−
1
+
φ
φ
∆
φ
∆
g
t
∆
φ
+
φ
g
t
∆
φ
b
=
∆
g
t
∆
φ
φ
∆
φ
∆
θ
θ
θ
x
y
x
∆
N
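$$\Delta_{N} \equiv \arg\min_{\Delta\theta}\; \mathbb{E}_{\hat{p}}\big[-\log p_{\theta+\Delta\theta}(x)\big] \quad \text{s.t.} \quad \mathrm{KL}\big(p_{\theta}(x)\,\|\,p_{\theta+\Delta\theta}(x)\big) = \Delta\mathrm{KL}.$$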
∆KL
∆
θ
→
0
X
log
p
θ
+∆
θ
θ
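$$\log p_{\theta+\Delta\theta} \approx \log p_{\theta} + (\nabla \log p_{\theta})^{\top} \Delta\theta + \frac{1}{2}\, \Delta\theta^{\top}\, \nabla^{2} \log p_{\theta}\; \Delta\theta.$$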
X
p
θ
∇
θ
log
p
θ
=
X
∇
θ
p
θ
=
∇
θ
X
p
θ
=
∇
θ
1 = 0
,
(
p
θ
p
θ
+∆
θ
)
(
p
θ
p
θ
+∆
θ
) =
X
p
θ
log
p
θ
−
X
p
θ
log
p
θ
+∆
θ
≈
X
p
θ
log
p
θ
−
X
p
θ
log
p
θ
+
(
∇
log
p
θ
)
∆
θ
+
1
2
∆
θ
∇
2
log
p
θ
∆
θ
=
−
1
2
∆
θ
E
p
θ
∇
2
log
p
θ
∆
θ
(
∇
log
p
θ
)
∆
θ
E
p
θ
−∇
2
log
p
θ
log
p
θ
E
p
θ
(
∇
log
p
θ
)
(
∇
log
p
θ
)
.
0 =
∇
2
θ
X
p
θ
=
X
∇
θ
(
p
θ
∇
θ
log
p
θ
) =
X
p
(
∇
θ
log
p
θ
)
∇
θ
log
p
θ
+
X
p
∇
2
θ
log
p
θ
log
p
θ
+∆
θ
L
N
(
θ
,
∆
θ
) =
E
ˆ
p
[
−
log
p
θ
]
+
E
ˆ
p
[
−∇
log
p
θ
]
+
λ
2
∆
θ
E
p
θ
−∇
2
log
p
θ
∆
θ
.
∆
θ
∇
∆
θ
L
N
(
θ
,
∆
θ
)
=
0
∆
θ
=
θ
t
+1
−
θ
t
θ
t
+1
=
θ
t
+
E
p
θ
−∇
2
log
p
θ
−
1
E
ˆ
p
[
−∇
log
p
θ
]
.
p
θ
ˆ
p
f
(
x
)
x
i
x
j
f
(
x
) =
(
x
1
−
x
2
)
2
+
α
x
2
1
+
y
2
1
α
α
θ
θ
0
p
(
θ
)
θ
0
θ
0
θ
0
m
n
U
(
−
1
√
m
,
1
√
n
)
$$W_{i,j} \sim U\!\left(-\sqrt{\frac{6}{m+n}},\; \sqrt{\frac{6}{m+n}}\right).$$
g
g
1
/
√
m
k
m
m
•
i
c
i
c
b
(
b
)
=
c
x
x
•
•
$$p(y \mid x) = \mathcal{N}\big(y \mid w^{\top} x + b,\; 1/\beta\big)$$
β
k
δ
J
(
θ
)
{
J
(0)
,
.
.
.
,
J
(
n
)
}
J
(0)
J
(
n
)
J
(
θ
)
J
(
i
)
J
(
i
+1)
θ
J
(
θ
)
(
i
)
(
θ
) =
E
θ
∼N
(
θ
,
θ
,σ
(
i
)2
)
J
(
θ
)
Track local minima
Final solution
Easy to find minimum
J
(
θ
) =
−
θ
θ
J
(
i
)